Setup

Load Libraries

library(readxl)
library(DT)
library(data.table)
library(dplyr)
library(ggplot2)
library(plotly)
library(cowplot)
library(curl)
library(biomaRt)
library(sqldf)
# Ensembl LD API
library(httr)
library(jsonlite)
library(xml2)  
library(gaston)
 
# *** susieR ****
# library(knitrBootstrap) #install_github('jimhester/knitrBootstrap')
library(susieR) # devtools::install_github("stephenslab/susieR") 

# *** finemapr ****
## finemapr contains: finemap, CAVIAR, and PAINTOR
# library(finemapr) # devtools::install_github("variani/finemapr")
library(roxygen2) #roxygenize()  

# *** locuscomparer ****
# https://github.com/boxiangliu/locuscomparer
library(locuscomparer); #devtools::install_github("boxiangliu/locuscomparer")

# thm <- knitr::knit_theme$get("bipolar")
# knitr::knit_theme$set(thm)

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.2
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] locuscomparer_1.0.0 roxygen2_6.1.1      susieR_0.6.2.0411  
##  [4] gaston_1.5.4        RcppParallel_4.4.2  Rcpp_1.0.0         
##  [7] xml2_1.2.0          jsonlite_1.6        httr_1.4.0         
## [10] sqldf_0.4-11        RSQLite_2.1.1       gsubfn_0.7         
## [13] proto_1.0.0         biomaRt_2.38.0      curl_3.3           
## [16] cowplot_0.9.4       plotly_4.8.0        ggplot2_3.1.0      
## [19] dplyr_0.7.8         data.table_1.12.0   DT_0.5.1           
## [22] readxl_1.2.0       
## 
## loaded via a namespace (and not attached):
##  [1] Biobase_2.42.0       tidyr_0.8.2          bit64_0.9-7         
##  [4] viridisLite_0.3.0    assertthat_0.2.0     expm_0.999-3        
##  [7] stats4_3.5.1         blob_1.1.1           cellranger_1.1.0    
## [10] yaml_2.2.0           progress_1.2.0       pillar_1.3.1        
## [13] lattice_0.20-38      glue_1.3.0           chron_2.3-53        
## [16] digest_0.6.18        colorspace_1.4-0     htmltools_0.3.6     
## [19] Matrix_1.2-15        plyr_1.8.4           XML_3.98-1.16       
## [22] pkgconfig_2.0.2      purrr_0.3.0          scales_1.0.0        
## [25] tibble_2.0.1         IRanges_2.16.0       withr_2.1.2         
## [28] BiocGenerics_0.28.0  lazyeval_0.2.1       magrittr_1.5        
## [31] crayon_1.3.4         memoise_1.1.0        evaluate_0.12       
## [34] tools_3.5.1          prettyunits_1.0.2    hms_0.4.2           
## [37] stringr_1.3.1        S4Vectors_0.20.1     munsell_0.5.0       
## [40] AnnotationDbi_1.44.0 bindrcpp_0.2.2       compiler_3.5.1      
## [43] rlang_0.3.1          grid_3.5.1           RCurl_1.95-4.11     
## [46] htmlwidgets_1.3      bitops_1.0-6         tcltk_3.5.1         
## [49] rmarkdown_1.11       gtable_0.2.0         DBI_1.0.0           
## [52] R6_2.3.0             knitr_1.21           bit_1.1-14          
## [55] bindr_0.1.1          commonmark_1.7       stringi_1.2.4       
## [58] parallel_3.5.1       tidyselect_0.2.5     xfun_0.4
print(paste("susieR ", packageVersion("susieR")))
## [1] "susieR  0.6.2.411"

General Functions

createDT <- function(DF, caption="", scrollY=400){
  data <- DT::datatable(DF, caption=caption,
    extensions = 'Buttons',
    options = list( dom = 'Bfrtip', 
                    buttons = c('copy', 'csv', 'excel', 'pdf', 'print'), 
                    scrollY = scrollY, scrollX=T, scrollCollapse = T, paging = F,  
                      columnDefs = list(list(className = 'dt-center', targets = "_all"))
    )
  ) 
   return(data)
}
createDT_html <- function(DF, caption="", scrollY=400){
  print( htmltools::tagList( createDT(DF, caption, scrollY)) ) 
} 

Data Preprocessing

TSS Data

Use bioMart to get TSS positions for each gene

get_TSS_position <- function(gene){ 
  mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
  # att <- listAttributes(mart)
  # grep("transcription", att$name, value=TRUE)
  TSS <- getBM(mart=mart,
               attributes=c("hgnc_symbol","transcription_start_site", "version"), 
               filters="hgnc_symbol", values=gene)
} 

Import GWAS Summary Statistics

For each gene, get the position of top SNP (the one with the greatest effect size/Beta)

## Only the significant subset of results
Nalls_SS_sig <- read_excel("Data/Nalls2018_S2_SummaryStats.xlsx", sheet = "Data")
top_SNPs <- Nalls_SS_sig %>% group_by(`Nearest Gene`) %>% top_n(-1, `P, all studies`) #`Beta, all studies` 
top_SNPs <- cbind(Coord=paste(top_SNPs$CHR,top_SNPs$BP, sep=":"), top_SNPs)
createDT(Nalls_SS_sig, caption = "Nalls (2018): Table S2. Detailed summary statistics on all nominated risk variants, known and novel")
# Nall (2018) QTL Stats
# Nalls_QTL <- read_excel("Data/Nalls2018_S4_QTL-MR.xlsx") 
# createDT(Nalls_QTL, caption = "Nalls (2018) Table S4. Expanded summary statistics for QTL Mendelian randomization") 

Get Flanking SNPs

Get all genes surrounding the index SNP (default is 1Mb upstream + 1Mb downstream) Nalls et al. (2019) data: https://github.com/neurogenetics/meta5

get_flanking_SNPs <- function(gene, top_SNPs, bp_distance=1000000){ #1000000
  topSNP_sub <- subset(top_SNPs, `Nearest Gene`==gene)
  
  ## Full dataset  
  filePath <- "Data/META.PD.NALLS2014.PRS.tsv"# "Data/nallsEtal2019_no23andMe.tab.txt"
  # Get col names
  # strsplit( readLines(filePath, n = 2), "\t")   
  minPos <- topSNP_sub$BP - bp_distance
  maxPos <- topSNP_sub$BP + bp_distance
  
  geneSubset <- read.csv.sql(filePath, sep="\t", stringsAsFactors=F,
                           sql = paste('select * from file where CHR =',topSNP_sub$CHR,
                                       'AND POS BETWEEN', minPos, 'AND', maxPos)) 
  geneSubset <- geneSubset %>% dplyr::rename(SNP=MarkerName)
  return(geneSubset)
} 
# flankingSNPs <- get_flanking_SNPs("LRRK2", top_SNPs)

susieR

Gaston LD

gaston_LD <- function(gene, flankingSNPs){
  # Download portion of vcf from 1KG website
  region <- paste(flankingSNPs$CHR[1],":",min(flankingSNPs$POS),"-",max(flankingSNPs$POS), sep="")
  chrom <- flankingSNPs$CHR[1]
  vcf_URL <- paste("ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr",chrom,".phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz",sep="")
  system(paste("tabix -fh ",vcf_URL ,region, "> subset.vcf")) 
  
  # Import w/ gaston and further subset
  bed.file <- read.vcf("subset.vcf") 
  bed <- select.snps(bed.file, id %in% flankingSNPs$SNP)
  rm(bed.file)
  
  # Calculate pairwise LD for all SNP combinations
  ld.x <- gaston::LD(bed, lim = c(1,ncol(bed)))  
  # LD plot
  limit <- 20
  LD.plot( ld.x[1:limit,1:limit], snp.positions = bed@snps$pos[1:limit])
  
  # Double check subsetting
  LD_matrix <- ld.x[row.names(ld.x) %in% flankingSNPs$SNP, colnames(ld.x) %in% flankingSNPs$SNP] 
  return(LD_matrix)
}
# LD_matrix <- gaston_LD("LRRK2",flankingSNPs) 

susieR Function

  • Other fine-mapping tools included in susieR
  • susieR::N3finemapping.CAVIAR()
  • susieR::N3finemapping.DAP()
  • susieR::N3finemapping.FINEMAP()

  • Statistical Terms:
  • posterior inclusion probability (PIP)
  • coefficient estimate (Beta)
  • Effect allele frequency (EAF)
  • The I^2 statistic describes the percentage of variation across studies that seems not to be due to chance.

susie_on_gene <- function(gene, top_SNPs){
  flankingSNPs <- get_flanking_SNPs(gene, top_SNPs)  
  ### Get LD matrix 
  LD_matrix <- gaston_LD(gene, flankingSNPs) 
  ## Turn LD matrix into positive semi-definite matrix
  # LD_matrix2 <- ifelse(matrixcalc::is.positive.semi.definite(LD_matrix), 
  #        LDmatrix,
  #        Matrix::nearPD(LD_matrix)$mat %>% as.matrix() )
  
  ## Subset summary stats to only include SNPs found in query 
  geneSubset <- subset(flankingSNPs, SNP %in% row.names(LD_matrix)) 
  b <- geneSubset$Effect
  se <- geneSubset$StdErr 
  
  # Run Susie
  fitted_bhat <- susie_bhat(bhat = b, shat = se,
                            R = LD_matrix, 
                            n = nrow(LD_matrix), 
                            var_y = var(b),
                            L = 1, # Must lower to 1 if entering only 3 SNPs
                            scaled_prior_variance = 0.1, 
                            estimate_residual_variance = TRUE, 
                            estimate_prior_variance = FALSE, 
                            standardize = TRUE) 
  # Format results
  geneSubset$Coord <- paste(geneSubset$CHR, geneSubset$POS, sep="")
  susieDF <- data.frame(SNP=names(fitted_bhat$X_column_scale_factors), PIP=fitted_bhat$pip) %>%
  base::merge(subset(geneSubset, select=c("SNP","Effect","P.value", "Coord")), by="SNP") 
  createDT_html(susieDF, paste("susieR Results: ", gene), scrollY = 200)  
  return(susieDF)
}  
# susieDF <- susie_on_gene("LRRK2", top_SNPs)

Plot susieR Results

before_after_plots <- function(susieDF){ 
  ## Before fine-mapping 
  geneSubset <- susieDF %>% arrange(desc(abs(Effect)))
  labelSNPs <- geneSubset[1:5,]
  
  before_plot <- ggplot(geneSubset, aes(x=Coord, y=Effect, label=SNP, color= -log(P.value) )) + 
    geom_hline(yintercept=0,alpha=.5, linetype=1, size=.5) +
    geom_point() + 
    geom_point(data=labelSNPs[1,], 
               pch=21, fill=NA, size=4, colour="red", stroke=1) +
    geom_segment(aes(xend=Coord, yend=0, color= -log(P.value)), alpha=.5) +
    geom_text(data=labelSNPs, aes(label=SNP), 
              vjust="inward",hjust="inward", nudge_y=.05, nudge_x=.25) +
    labs(title=paste(gene, "Before Fine Mapping",sep="\n"), y="Beta", x="BP Position",
         color="-log(p-value)\nAll Studies") +
    theme(plot.title = element_text(hjust = 0.5))
  
  ## After fine-mapping
  susieDF %>% arrange(desc(PIP))
  labelSNPs_PIP <- susieDF[1:5,]
  
  after_plot <- ggplot(susieDF, aes(x=Coord, y=PIP, label=SNP, color= -log(P.value) )) + 
    geom_hline(yintercept=0,alpha=.5, linetype=1, size=.5) +
    geom_point() +  
    geom_point(data=susieDF[susieDF$PIP == max(susieDF$PIP),], 
               pch=21, fill=NA, size=4, colour="green", stroke=1) +
    geom_segment(aes(xend=Coord, yend=0, color= -log(P.value))) +
    geom_text(data=labelSNPs_PIP, aes(label=SNP),
              vjust="inward",hjust="inward",nudge_x = .25) +
    labs(title=paste(gene, "After Fine Mapping",sep="\n"), y="PIP", x="BP Position",
         color="-log(p-value)\nAll Studies") +
    theme(plot.title = element_text(hjust = 0.5))#theme(legend.position="none") 
  
  plot_grid(before_plot, after_plot, nrow = 2) %>% print() 
  # susie_plot(fitted_bhat, y="PIP", b=b, add_bar = T)  
}
# before_after_plots(susieDF)

Fine Map with Summary Statistics

geneList <- c("LRRK2","GBAP1","SNCA","VPS13C","GCH1")#unique(top_SNPs$`Nearest Gene`) 
#Nalls_SS %>% group_by(`Nearest Gene`) %>% tally() %>% subset(n>2)

fineMapped_topSNPs <- data.table()
for (gene in geneList){ 
  cat("\n")
  cat("### ", gene, "\n")
  susieDF <- susie_on_gene(gene, top_SNPs)
  before_after_plots(susieDF)
  
  # Create summary table for all genes
  newEntry <- cbind(data.table(Gene=gene), subset(susieDF, PIP==max(PIP)) %>% as.data.table())
  fineMapped_topSNPs <- rbind(fineMapped_topSNPs, newEntry)
  cat("\n")
}

LRRK2

ped stats and snps stats have been set. ‘p’ has been set. ‘mu’ and ‘sigma’ have been set.

GBAP1

ped stats and snps stats have been set. ‘p’ has been set. ‘mu’ and ‘sigma’ have been set.

SNCA

ped stats and snps stats have been set. ‘p’ has been set. ‘mu’ and ‘sigma’ have been set.

VPS13C

ped stats and snps stats have been set. ‘p’ has been set. ‘mu’ and ‘sigma’ have been set.

GCH1

ped stats and snps stats have been set. ‘p’ has been set. ‘mu’ and ‘sigma’ have been set.

Top SNPs

createDT(fineMapped_topSNPs, "Potential Causal SNPs Identified by susieR", scrollY = 200)
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html